Importing Libraries¶

In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve

Reading the CSV file and checking for missing values, data types, and a statistical description of the dataset¶

In [2]:
# Load the synthetic transactions dataset (one row per transaction).
# NOTE(review): relative path — assumes the CSV sits next to the notebook.
data = pd.read_csv("transaction_anomalies_dataset.csv")
# Preview the first five rows (rich DataFrame display).
data.head()
Out[2]:
Transaction_ID Transaction_Amount Transaction_Volume Average_Transaction_Amount Frequency_of_Transactions Time_Since_Last_Transaction Day_of_Week Time_of_Day Age Gender Income Account_Type
0 TX0 1024.835708 3 997.234714 12 29 Friday 6:00 36 Male 1436074 Savings
1 TX1 1013.952065 4 1020.210306 7 22 Friday 1:00 41 Female 627069 Savings
2 TX2 970.956093 1 989.496604 5 12 Tuesday 21:00 61 Male 786232 Savings
3 TX3 1040.822254 2 969.522480 16 28 Sunday 14:00 61 Male 619030 Savings
4 TX4 998.777241 1 1007.111026 7 7 Friday 8:00 56 Female 649457 Savings
In [3]:
# Count missing values per column (isna is the modern alias of isnull).
data.isna().sum()
Out[3]:
Transaction_ID                 0
Transaction_Amount             0
Transaction_Volume             0
Average_Transaction_Amount     0
Frequency_of_Transactions      0
Time_Since_Last_Transaction    0
Day_of_Week                    0
Time_of_Day                    0
Age                            0
Gender                         0
Income                         0
Account_Type                   0
dtype: int64
In [4]:
# Column dtypes and non-null counts; confirms 1000 rows with no missing data.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Transaction_ID               1000 non-null   object 
 1   Transaction_Amount           1000 non-null   float64
 2   Transaction_Volume           1000 non-null   int64  
 3   Average_Transaction_Amount   1000 non-null   float64
 4   Frequency_of_Transactions    1000 non-null   int64  
 5   Time_Since_Last_Transaction  1000 non-null   int64  
 6   Day_of_Week                  1000 non-null   object 
 7   Time_of_Day                  1000 non-null   object 
 8   Age                          1000 non-null   int64  
 9   Gender                       1000 non-null   object 
 10  Income                       1000 non-null   int64  
 11  Account_Type                 1000 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 93.9+ KB
In [5]:
# Summary statistics for the numeric columns; note the long upper tail on
# Transaction_Amount (max ~3227 vs. 75th percentile ~1033).
data.describe()
Out[5]:
Transaction_Amount Transaction_Volume Average_Transaction_Amount Frequency_of_Transactions Time_Since_Last_Transaction Age Income
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1.000000e+03
mean 1038.122511 2.498000 1000.682506 12.078000 15.341000 40.641000 8.948238e+05
std 283.580055 1.115006 20.632334 4.245225 8.361258 13.819953 3.453562e+05
min 849.024392 1.000000 939.081423 5.000000 1.000000 18.000000 3.001590e+05
25% 966.028796 1.000000 986.800556 8.000000 8.000000 29.000000 5.917308e+05
50% 1002.118678 3.000000 1000.501903 12.000000 16.000000 41.000000 8.876645e+05
75% 1033.143657 3.000000 1015.155595 16.000000 22.000000 53.000000 1.178102e+06
max 3227.459108 4.000000 1073.154036 19.000000 29.000000 64.000000 1.499670e+06

Visualizing various charts to get insights from the Data (Histogram, Boxplot, Scatterplot, Barchart, Heatmap)¶

In [6]:
# Histogram: distribution of transaction amounts across the dataset.
dist_transaction = px.histogram(
    data,
    x='Transaction_Amount',
    nbins=20,
    title='Distribution of Transaction Amount',
)
dist_transaction.update_layout(width=1000, height=600,
                               xaxis_title='Transaction Amount',
                               yaxis_title='Frequency')
dist_transaction.show()
In [7]:
# Box plot: transaction amount distribution per account type.
transaction_acc_type = px.box(data, x = 'Account_Type', y = 'Transaction_Amount', title = 'Transaction Amount by Account Type')
# Fix: axis titles were swapped — the x-axis carries the account type and
# the y-axis carries the transaction amount.
transaction_acc_type.update_layout(width = 1000, height = 600, xaxis_title = 'Account Type', yaxis_title = 'Transaction Amount')
transaction_acc_type.show()
In [8]:
# Scatter of average transaction amount against customer age, colored by
# account type, with an OLS trendline fitted per account type.
avg_amount_age = px.scatter(
    data,
    x='Age',
    y='Average_Transaction_Amount',
    color='Account_Type',
    trendline='ols',
    title='Average Transaction Amount vs. Age',
)
avg_amount_age.update_layout(width=1000, height=600,
                             xaxis_title='Age',
                             yaxis_title='Average Transaction Amount')
avg_amount_age.show()
In [9]:
# Bar chart: how many transactions fall on each day of the week.
day_of_week = px.bar(data, x='Day_of_Week',
                     title='Frequency of Transactions by Day of the Week')
day_of_week.update_layout(width=1000, height=600,
                          xaxis_title='Day of Week',
                          yaxis_title='Frequency')
day_of_week.show()
In [10]:
# Correlation heatmap over the numeric columns only (object columns such as
# Day_of_Week would break .corr()).
numeric_data = data.select_dtypes(include=['number'])
correlation_matrix = numeric_data.corr()
fig_corr_heatmap = go.Figure(
    data=go.Heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
    )
)
fig_corr_heatmap.update_layout(title='Correlation Heatmap', height=600)
fig_corr_heatmap.show()

Statistical Method: Z-Score¶

In [11]:
# One-sided z-score rule: flag transactions more than two standard
# deviations ABOVE the mean amount as anomalies (low outliers are not
# flagged by this rule).
mean_amount = data['Transaction_Amount'].mean()
std_amount = data['Transaction_Amount'].std()
anomaly_threshold = mean_amount + 2 * std_amount

# Boolean flag column; consumed by the modelling cells below.
data['Is_Anomaly'] = data['Transaction_Amount'] > anomaly_threshold

# Highlight the flagged points on a scatter of amount vs. average amount.
anomalies = px.scatter(
    data,
    x='Transaction_Amount',
    y='Average_Transaction_Amount',
    color='Is_Anomaly',
    title='Anomalies in Transaction Amount',
)
anomalies.update_traces(marker=dict(size=12),
                        selector=dict(mode='markers', marker_size=1))
anomalies.update_layout(height=600,
                        xaxis_title='Transaction Amount',
                        yaxis_title='Average Transaction Amount')
anomalies.show()

Applying Various Machine Learning Models¶

Machine Learning Model 1: Isolation Forest¶

In [12]:
# Share of transactions flagged by the z-score rule — this ratio is fed to
# IsolationForest as the contamination parameter below.
num_anomalies = data['Is_Anomaly'].sum()
total_instances = len(data)
anomaly_ratio = num_anomalies / total_instances
print(anomaly_ratio)
0.02
In [13]:
# Isolation Forest on three behavioural features; contamination is set to
# the 2% anomaly ratio measured above.
relevant_features = ['Transaction_Amount', 'Average_Transaction_Amount', 'Frequency_of_Transactions']

# Feature matrix and z-score-derived labels. The labels are used only for
# evaluation — Isolation Forest itself trains unsupervised.
X = data[relevant_features]
y = data['Is_Anomaly']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the unsupervised detector on the training features only.
model = IsolationForest(contamination=0.02, random_state=42)
model.fit(X_train)
/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py:439: UserWarning:

X does not have valid feature names, but IsolationForest was fitted with feature names

Out[13]:
IsolationForest(contamination=0.02, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
IsolationForest(contamination=0.02, random_state=42)
In [14]:
# Score the held-out rows. IsolationForest.predict returns -1 for
# anomalies and 1 for normal points.
y_pred = model.predict(X_test)

# Map to the label convention used by y_test (0 = normal, 1 = anomaly).
y_pred_binary = [int(pred == -1) for pred in y_pred]

# Per-class precision / recall / F1 against the z-score labels.
report = classification_report(y_test, y_pred_binary, target_names=['Normal', 'Anomaly'])
print(report)
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00       196
     Anomaly       1.00      1.00      1.00         4

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

In [15]:
# Relevant features used during training (order must match the training set
# so the model sees columns in the same layout).
relevant_features = ['Transaction_Amount', 'Average_Transaction_Amount', 'Frequency_of_Transactions']

# Collect one numeric value per feature from the user.
user_inputs = []
for feature in relevant_features:
    user_input = float(input(f"Enter the value for '{feature}': "))
    user_inputs.append(user_input)

# Single-row DataFrame so column names match the training features (avoids
# the "X does not have valid feature names" warning seen earlier).
user_df = pd.DataFrame([user_inputs], columns=relevant_features)

# predict() returns an array of -1 (anomaly) / 1 (normal), one per row.
user_anomaly_pred = model.predict(user_df)

# Fix: index the single prediction explicitly instead of relying on the
# truthiness of a one-element array comparison, which is fragile and
# deprecated for multi-element arrays.
user_anomaly_pred_binary = 1 if user_anomaly_pred[0] == -1 else 0

if user_anomaly_pred_binary == 1:
    print("Anomaly detected: This transaction is flagged as an anomaly.")
else:
    print("No anomaly detected: This transaction is normal.")
Anomaly detected: This transaction is flagged as an anomaly.

Machine Learning Model 2: Random Forest¶

In [16]:
# Supervised check: fit a Random Forest on the same split, then show its
# impurity-based feature importances.
# NOTE(review): this rebinds `model` (previously the Isolation Forest);
# the ROC cell below relies on this Random Forest instance.
model = RandomForestClassifier()
model.fit(X_train, y_train)
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
).sort_values(by='Importance', ascending=False)
fig = px.bar(feature_importance, x='Importance', y='Feature', title='Feature Importance')
fig.show()

Model Evaluation: ROC Curve for the Random Forest¶

In [17]:
# ROC curve for the Random Forest using its positive-class probabilities.
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
auc = roc_auc_score(y_test, y_pred_prob)

fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'AUC = {auc:.2f}'))
fig.update_layout(title='ROC Curve',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate')
fig.show()

Machine Learning Model 4: Autoencoder¶

In [18]:
# Autoencoder trained on all numeric columns to learn "normal" structure;
# rows it reconstructs poorly are candidate anomalies.
numeric_data = data.select_dtypes(include=['number'])

# Standardize features so the reconstruction loss weighs them comparably.
scaler = StandardScaler()
numeric_data_scaled = scaler.fit_transform(numeric_data)

# Split into training and testing sets.
# Fix: removed the redundant in-cell `from sklearn.model_selection import
# train_test_split` — it is already imported in the imports cell.
# NOTE(review): this rebinds X_train/X_test from the earlier Isolation
# Forest split; later cells depend on these autoencoder arrays, so the
# names are kept, but beware of hidden-state issues on out-of-order reruns.
X_train, X_test = train_test_split(numeric_data_scaled, test_size=0.2, random_state=42)

# Symmetric autoencoder: input -> 14 -> 7 (bottleneck) -> 14 -> input.
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(14, activation = "relu")(input_layer)
encoder = Dense(7, activation = "relu")(encoder)
decoder = Dense(14, activation = "relu")(encoder)
# NOTE(review): sigmoid bounds reconstructions to (0, 1) while standardized
# inputs are unbounded — a linear output layer would usually fit better
# here; left unchanged to preserve the recorded results. TODO confirm.
decoder = Dense(input_dim, activation = "sigmoid")(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

autoencoder.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Train to reconstruct the inputs themselves (targets == inputs).
history = autoencoder.fit(X_train, X_train, epochs = 50, batch_size = 32, validation_data = (X_test, X_test), verbose = 1)
Epoch 1/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 1.3045 - val_loss: 1.1581
Epoch 2/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 594us/step - loss: 1.1768 - val_loss: 1.0951
Epoch 3/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 692us/step - loss: 1.0958 - val_loss: 1.0241
Epoch 4/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 649us/step - loss: 1.0161 - val_loss: 0.9623
Epoch 5/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 619us/step - loss: 0.9591 - val_loss: 0.9284
Epoch 6/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 604us/step - loss: 1.0238 - val_loss: 0.9104
Epoch 7/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 600us/step - loss: 1.0053 - val_loss: 0.8970
Epoch 8/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 600us/step - loss: 0.9609 - val_loss: 0.8836
Epoch 9/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 633us/step - loss: 0.8687 - val_loss: 0.8692
Epoch 10/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 637us/step - loss: 0.8921 - val_loss: 0.8547
Epoch 11/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 609us/step - loss: 0.8672 - val_loss: 0.8422
Epoch 12/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 634us/step - loss: 0.8456 - val_loss: 0.8293
Epoch 13/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 623us/step - loss: 0.9667 - val_loss: 0.8159
Epoch 14/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 578us/step - loss: 0.8485 - val_loss: 0.8042
Epoch 15/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 609us/step - loss: 0.8028 - val_loss: 0.7917
Epoch 16/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 594us/step - loss: 0.7942 - val_loss: 0.7784
Epoch 17/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 593us/step - loss: 0.7770 - val_loss: 0.7679
Epoch 18/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 612us/step - loss: 0.7797 - val_loss: 0.7567
Epoch 19/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 624us/step - loss: 0.7284 - val_loss: 0.7486
Epoch 20/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 600us/step - loss: 0.7216 - val_loss: 0.7418
Epoch 21/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 622us/step - loss: 0.7559 - val_loss: 0.7365
Epoch 22/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 599us/step - loss: 0.7556 - val_loss: 0.7329
Epoch 23/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 597us/step - loss: 0.7486 - val_loss: 0.7291
Epoch 24/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 635us/step - loss: 0.7522 - val_loss: 0.7258
Epoch 25/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 697us/step - loss: 0.7853 - val_loss: 0.7229
Epoch 26/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 625us/step - loss: 0.7282 - val_loss: 0.7202
Epoch 27/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 625us/step - loss: 0.7295 - val_loss: 0.7177
Epoch 28/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 647us/step - loss: 0.7315 - val_loss: 0.7128
Epoch 29/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 628us/step - loss: 0.6913 - val_loss: 0.7099
Epoch 30/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 629us/step - loss: 0.7376 - val_loss: 0.7044
Epoch 31/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 594us/step - loss: 0.7168 - val_loss: 0.7004
Epoch 32/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 625us/step - loss: 0.6724 - val_loss: 0.6949
Epoch 33/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 594us/step - loss: 0.7314 - val_loss: 0.6881
Epoch 34/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 588us/step - loss: 0.7001 - val_loss: 0.6831
Epoch 35/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 619us/step - loss: 0.6827 - val_loss: 0.6791
Epoch 36/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 608us/step - loss: 0.6784 - val_loss: 0.6753
Epoch 37/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 583us/step - loss: 0.6358 - val_loss: 0.6727
Epoch 38/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 607us/step - loss: 0.6583 - val_loss: 0.6701
Epoch 39/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 618us/step - loss: 0.6502 - val_loss: 0.6646
Epoch 40/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 614us/step - loss: 0.6697 - val_loss: 0.6630
Epoch 41/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 609us/step - loss: 0.6677 - val_loss: 0.6592
Epoch 42/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 615us/step - loss: 0.6578 - val_loss: 0.6574
Epoch 43/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 614us/step - loss: 0.6711 - val_loss: 0.6576
Epoch 44/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 599us/step - loss: 0.6215 - val_loss: 0.6561
Epoch 45/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 614us/step - loss: 0.6640 - val_loss: 0.6535
Epoch 46/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 621us/step - loss: 0.6824 - val_loss: 0.6537
Epoch 47/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 643us/step - loss: 0.6902 - val_loss: 0.6531
Epoch 48/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 595us/step - loss: 0.6397 - val_loss: 0.6521
Epoch 49/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 698us/step - loss: 0.6800 - val_loss: 0.6523
Epoch 50/50
25/25 ━━━━━━━━━━━━━━━━━━━━ 0s 662us/step - loss: 0.6367 - val_loss: 0.6516
In [19]:
# Training vs. validation loss curves from the autoencoder fit.
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()

# Reconstruct the held-out rows and measure per-row mean squared error.
X_test_pred = autoencoder.predict(X_test)
mse = np.mean((X_test - X_test_pred) ** 2, axis=1)

# Treat the worst-reconstructed 5% of rows as anomalies.
threshold = np.percentile(mse, 95)
anomalies = mse > threshold

# Distribution of reconstruction errors with the cutoff marked.
plt.hist(mse, bins=50)
plt.axvline(threshold, color='r', linestyle='--')
plt.xlabel('Reconstruction Error')
plt.ylabel('Number of Samples')
plt.show()

# 5% of a 200-row test set -> expect about 10 flagged rows.
print(f'Number of anomalies detected: {anomalies.sum()}')
No description has been provided for this image
7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step 
No description has been provided for this image
Number of anomalies detected: 10

Identifying which model is best (Isolation Forest or Autoencoder)¶

In [20]:
# Compare the two detectors on held-out data.
# NOTE(review): y_test comes from the Isolation Forest split of `X`
# (3 features), while X_test was re-bound by the autoencoder cell to the
# scaled full numeric matrix. The two splits only line up row-for-row
# because both used test_size=0.2 and random_state=42 on the same 1000
# rows — fragile hidden-state coupling; verify before trusting scores.

# Isolation Forest evaluation
# NOTE(review): ROC-AUC computed from hard 0/1 predictions rather than
# anomaly scores collapses the curve to a single operating point.
roc_score_if = roc_auc_score(y_test, y_pred_binary)
precision_if, recall_if, _ = precision_recall_curve(y_test, y_pred_binary)

print(f"Isolation Forest ROC-AUC: {roc_score_if}")

# Autoencoder evaluation (requires reconstruction error calculation)
reconstruction_error = autoencoder.predict(X_test)
# Per-row mean squared reconstruction error.
mse = np.mean(np.power(X_test - reconstruction_error, 2), axis=1)
# Flag the worst-reconstructed 5% as anomalies.
threshold = np.percentile(mse, 95)
y_pred_autoencoder = mse > threshold

roc_score_ae = roc_auc_score(y_test, y_pred_autoencoder)
precision_ae, recall_ae, _ = precision_recall_curve(y_test, y_pred_autoencoder)

print(f"Autoencoder ROC-AUC: {roc_score_ae}")
Isolation Forest ROC-AUC: 1.0
7/7 ━━━━━━━━━━━━━━━━━━━━ 0s 309us/step
Autoencoder ROC-AUC: 0.9846938775510203
In [ ]: